package com.meiyuetao.myt.crawl.filter;

import java.util.List;
import java.util.Map;
import java.util.Set;

import lab.s2jh.crawl.filter.ParseFilterChain;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.common.collect.Sets;

/**
 * Parse filter that crawls JD (jd.com) category listing pages and injects the
 * commodity links found on them into the crawl queue.
 *
 * <p>The incoming URL may carry a page scope in square brackets in place of the
 * page number, for example:
 * <pre>
 *   http://mall.jd.com/view_search-171887-0-0-1-24-[1-5].html   pages 1..5
 *   http://mall.jd.com/view_search-171887-0-0-1-24-[3].html     page 3 only
 *   http://mall.jd.com/view_search-171887-0-0-1-24-[2-*].html   page 2 onwards
 *   http://mall.jd.com/view_search-171887-0-0-1-24-[*].html     page 1 only
 * </pre>
 * A URL without a bracketed scope is processed as a single category page.
 */
public class JdCategoryParseFilter extends AbstractCommodityParseFilter {

    private static final Logger logger = LoggerFactory.getLogger(JdCategoryParseFilter.class);

    /** Token that stands for "unspecified" inside a page scope. */
    private static final String WILDCARD = "*";

    /**
     * Upper bound used for an open-ended scope. Kept one below
     * {@link Integer#MAX_VALUE} so the page counter can never overflow.
     */
    private static final int OPEN_ENDED_LAST_PAGE = Integer.MAX_VALUE - 1;

    /** XPath selecting the commodity anchors on a category listing page. */
    private static final String ITEM_LINK_XPATH = "//DIV[@class='jItem']//DIV[@class='jDesc']//A";

    /**
     * Expands the optional {@code [start-end]} page scope of {@code url} and
     * processes every resulting category page in ascending order, stopping early
     * as soon as a page yields no commodity links.
     *
     * @param url         category page URL, optionally containing a bracketed page scope
     * @param filterChain the filter chain; not used by this method
     * @throws NumberFormatException    if a page bound is neither {@code *} nor an integer
     * @throws IllegalArgumentException if the page scope contains no page bound at all
     */
    @Override
    public void doFilterInternal(String url, ParseFilterChain filterChain) {
        logger.debug("Invoking {} ...", this.getClass());

        // http://mall.jd.com/view_search-171887-0-0-1-24-1.html
        // http://mall.jd.com/view_search-171887-0-0-1-24-2.html

        // null when there is no "[" followed by a later "]"
        String pageScope = StringUtils.substringBetween(url, "[", "]");
        if (pageScope == null) {
            parseCategoryPage(url);
            return;
        }

        int start = 1;
        int end = 1;
        if (!WILDCARD.equals(pageScope)) {
            String[] bounds = pageScope.split("-");
            if (bounds.length == 0) {
                throw new IllegalArgumentException("Invalid page scope '" + pageScope + "' in url: " + url);
            }
            start = parsePageBound(bounds[0], 1);
            // A scope without a dash, e.g. "[3]", designates exactly one page.
            end = bounds.length > 1 ? parsePageBound(bounds[1], OPEN_ENDED_LAST_PAGE) : start;
        }
        // else: per the original author's note, JD answers any page number beyond the
        // last one with the last page's data, so a bare "*" cannot be expanded and
        // falls back to the first page only.
        // NOTE(review): an open-ended scope such as "[2-*]" only stops when a page
        // returns no item links; given the remark above, confirm that this terminates.

        String pageNoBefore = StringUtils.substringBefore(url, "[");
        // Cut after the bracket pair that actually encloses the scope, not merely
        // after the first "]" found anywhere in the URL.
        String pageNoAfter = StringUtils.substringAfter(url, "[" + pageScope + "]");
        for (int pageNo = start; pageNo <= end; pageNo++) {
            if (parseCategoryPage(pageNoBefore + pageNo + pageNoAfter)) {
                break;
            }
        }
    }

    /**
     * Converts one side of a page scope into a page number.
     *
     * @param token         the text of the bound, either {@code *} or an integer
     * @param wildcardValue the page number to use when {@code token} is {@code *}
     * @return the page number, capped at {@link #OPEN_ENDED_LAST_PAGE}
     * @throws NumberFormatException if {@code token} is neither {@code *} nor an integer
     */
    private static int parsePageBound(String token, int wildcardValue) {
        if (WILDCARD.equals(token)) {
            return wildcardValue;
        }
        return Math.min(Integer.parseInt(token), OPEN_ENDED_LAST_PAGE);
    }

    /**
     * Fetches one category page, collects the {@code href} of every commodity
     * anchor on it and injects the distinct values through {@code crawlService}.
     *
     * @param catPageUrl concrete category page URL (no page scope)
     * @return {@code true} if the page contained no commodity links, which callers
     *         treat as the signal to stop paging; {@code false} otherwise
     */
    private boolean parseCategoryPage(String catPageUrl) {
        HtmlPage catPage = fetchHtmlPage(catPageUrl);
        logger.info("Process category page: {}", catPage.getUrl());
        List<?> links = catPage.getByXPath(ITEM_LINK_XPATH);
        if (links == null || links.isEmpty()) {
            logger.info("No more valid links...");
            return true;
        }

        // A set, so a link that appears several times on the page is injected once.
        Set<String> pageUrls = Sets.newHashSet();
        for (Object element : links) {
            HtmlAnchor anchor = (HtmlAnchor) element;
            String href = anchor.getAttribute("href");
            logger.info("Found new valid url: {}", href);
            pageUrls.add(href);
        }
        crawlService.injectUrls(pageUrls.toArray(new String[0]));
        return false;
    }

    /**
     * Not implemented for category pages: this filter only discovers links.
     *
     * @param url the page URL (ignored)
     * @return always {@code null}
     */
    @Override
    public Map<String, Object> parseSimpleData(String url) {
        return null;
    }
}
